home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
BBS in a Box 3
/
BBS in a box - Trilogy III.iso
/
Files
/
Prog
/
B-C
/
C++Source Code Fmtr Folder
/
Src
/
CScanner.cp
< prev
next >
Encoding:
Amiga
Atari
Commodore
DOS
FM Towns/JPY
Macintosh
Macintosh JP
NeXTSTEP
RISC OS/Acorn
UTF-8
Wrap
Text File
|
1991-08-20
|
24.2 KB
|
1,070 lines
|
[
TEXT/MPS
]
/*
** CScanner.cp
*/
#ifndef __CSCANNER__
#include "CScanner.h"
#endif
#ifndef __FORMATTING__
#include "Formatting.h"
#endif
#ifndef __CTYPE__
#include <ctype.h>
#endif
#ifndef __STDIO__
#include <stdio.h>
#endif
#ifndef __STRING__
#include <string.h>
#endif
/*
** TextPtr encapsulates the pointer into the text buffer
*/
typedef const unsigned char* TextPtr;
/*
** class SyntacticLex
** This class overrides SaveCopy by returning itself. All Syntactic objects
** returned by the CScanner are constant and are never modified by any
** inherited methods. If this fails to be true, override this method in the
** derived class
*/
#pragma segment CScanner
class SyntacticLex : public Syntactic {
public:
    // Pass the major type and the optional minor type straight through to
    // the Syntactic base class.
    SyntacticLex(int aType, int aMinorType = 0)
        : Syntactic(aType, aMinorType)
    {
    }

    // Separator query.  The implementation in this class answers false;
    // derived classes may override it.
    virtual Boolean IsSeparator() const;

    // The object cannot be modified by any of its methods, so this hands
    // back the object itself instead of a duplicate.
    virtual const Syntactic *SaveCopy() const;

    // Format and display this syntactic item.  The format to use when
    // displaying is passed in as the only argument.  Pure virtual here, so
    // every concrete token class has to supply it.
    virtual Boolean Display(Formatting *aFormat) = 0;
};
// A SyntacticLex is not a separator unless a derived class overrides this.
Boolean SyntacticLex::IsSeparator() const
{
    return false;
}
// "Copying" one of these tokens just yields the token itself; no new object
// is created.
const Syntactic *SyntacticLex::SaveCopy() const
{
    const Syntactic *self = this;
    return self;
}
/*
** class LexicalToken
** This class represents tokens whose body is in the CScanner buffer itself
*/
#pragma segment CScanner
class LexicalToken : public SyntacticLex {
public:
    // Major type only; the text is NUL-terminated, so the token ends at the
    // terminating NUL (not included).
    LexicalToken(short aType, TextPtr start)
        : SyntacticLex(aType),
          fStart(start),
          fEnd(start + strlen((const char *)start))
    {
    }

    // Major and minor type; NUL-terminated text.
    LexicalToken(short aType, short aMinorType, TextPtr start)
        : SyntacticLex(aType, aMinorType),
          fStart(start),
          fEnd(start + strlen((const char *)start))
    {
    }

    // Major type only; the text is the half-open range [start, end).
    LexicalToken(short aType, TextPtr start, TextPtr end)
        : SyntacticLex(aType),
          fStart(start),
          fEnd(end)
    {
    }

    // Major and minor type; the text is the half-open range [start, end).
    LexicalToken(short aType, short aMinorType, TextPtr start, TextPtr end)
        : SyntacticLex(aType, aMinorType),
          fStart(start),
          fEnd(end)
    {
    }

    // Display the token using the formatting information given by aFormat.
    virtual Boolean Display(Formatting *aFormat);

protected:
    TextPtr fStart;     // first character of the token text
    TextPtr fEnd;       // one past the last character of the token text
};
// Write the token text [fStart, fEnd) through aFormat.  The result is true
// when the token text is non-empty.
Boolean LexicalToken::Display(Formatting *aFormat)
{
    Boolean hasText = (fStart != fEnd);

    aFormat->Write((char *)fStart, fEnd - fStart);
    return hasText;
}
/*
** class PredefinedToken
** Predefined tokens which never change their form
*/
#pragma segment CScanner
class PredefinedToken : public LexicalToken {
public:
    // The constructors accept the type and the string in either order.  In
    // every case the string is NUL-terminated and becomes the token text.

    // (type, string)
    PredefinedToken(short aType, const char *aString)
        : LexicalToken(aType, (TextPtr)aString)
    {
    }

    // (type, string, minor type)
    PredefinedToken(short aType, const char *aString, short aMinorType)
        : LexicalToken(aType, aMinorType, (TextPtr)aString)
    {
    }

    // (string, type)
    PredefinedToken(const char *aString, short aType)
        : LexicalToken(aType, (TextPtr)aString)
    {
    }

    // (string, type, minor type)
    PredefinedToken(const char *aString, short aType, short aMinorType)
        : LexicalToken(aType, aMinorType, (TextPtr)aString)
    {
    }

    // Return the string held in the token itself.
    const char *String() const
    {
        return (const char *)fStart;
    }
};
/*
** class ReservedWord:
** Reserved words are a subclass of PredefinedTokens.
*/
#pragma segment CScanner
class ReservedWord : public PredefinedToken {
public:
    // All four constructors forward to PredefinedToken in the
    // (type, string[, minor type]) order, whichever order they were given.

    ReservedWord(short aType, const char *aString)
        : PredefinedToken(aType, aString)
    {
    }

    ReservedWord(short aType, const char *aString, short aMinorType)
        : PredefinedToken(aType, aString, aMinorType)
    {
    }

    ReservedWord(const char *aString, short aType)
        : PredefinedToken(aType, aString)
    {
    }

    ReservedWord(const char *aString, short aType, short aMinorType)
        : PredefinedToken(aType, aString, aMinorType)
    {
    }
};
/*
** class OperatorToken:
** Operators are a subclass of PredefinedTokens.
*/
#pragma segment CScanner
class OperatorToken : public PredefinedToken {
public:
    // All four constructors forward to PredefinedToken in the
    // (type, string[, minor type]) order, whichever order they were given.

    OperatorToken(short aType, const char *aString)
        : PredefinedToken(aType, aString)
    {
    }

    OperatorToken(short aType, const char *aString, short aMinorType)
        : PredefinedToken(aType, aString, aMinorType)
    {
    }

    OperatorToken(const char *aString, short aType)
        : PredefinedToken(aType, aString)
    {
    }

    OperatorToken(const char *aString, short aType, short aMinorType)
        : PredefinedToken(aType, aString, aMinorType)
    {
    }
};
/*
** class WhiteSpaceToken:
** This class encapsulates those items which are white space (separators)
** but which the formatter is concerned about. These include comments, newlines,
** preprocessor lines, etc.
*/
#pragma segment CScanner
class WhiteSpaceToken : public SyntacticLex {
public:
    // Only a major type is given; the minor type takes SyntacticLex's
    // default.
    WhiteSpaceToken(int aType)
        : SyntacticLex(aType)
    {
    }

    // Separator query.  The implementation in this class answers true.
    virtual Boolean IsSeparator() const;

    // Format and display this syntactic item.  The format to use when
    // displaying is passed in as the only argument.  Still pure virtual at
    // this level.
    virtual Boolean Display(Formatting *aFormat) = 0;
};
// Every WhiteSpaceToken reports itself as a separator.
Boolean WhiteSpaceToken::IsSeparator() const
{
    return true;
}
/*
** class NewLineToken
** This class encapsulates source new lines. Source new lines may or may
** not be passed through to the output depending on what the user has said
** about new line preservation
*/
#pragma segment CScanner
class NewLineToken : public WhiteSpaceToken {
public:
    // aType tells the different new-line-like tokens apart; Display() treats
    // kSLex_Null specially.
    NewLineToken(short aType)
        : WhiteSpaceToken(aType)
    {
    }

    // Emit the new line through aFormat.
    virtual Boolean Display(Formatting *aFormat);
};
// Start a new line in the output.  A token of type kSLex_Null first emits a
// backslash, so the new line is written as a line continuation.  Always
// returns true.
Boolean NewLineToken::Display(Formatting *aFormat)
{
    const Boolean needsBackslash = (Type() == kSLex_Null);

    if (needsBackslash) {
        aFormat->Putc('\\');
    }
    aFormat->NewLine();
    return true;
}
/*
** class CommentToken
** This class contains comments. Comments are separators and are formatted
** by the Formatting class. For implementation purposes, "# preprocessor"
** lines are also considered comments.
*/
#pragma segment CScanner
class CommentToken : public WhiteSpaceToken {
public:
    // Build a comment (or "#" line) token over the text [start, end).
    //
    // A "formatting off" comment has the form "//ƒ-" or "/*ƒ-", and a
    // "formatting on" comment has the form "//ƒ+" or "/*ƒ+".  When the text
    // has one of these forms the minor type is set to
    // kSLex_CommentFormatOff or kSLex_CommentFormatOn respectively.
    CommentToken(short aType, TextPtr start, TextPtr end)
        : WhiteSpaceToken(aType),
          fStart(start),
          fEnd(end)
    {
        // The florin marker is spelled as its Mac OS Roman byte value so the
        // comparison does not depend on the encoding this source file is
        // compiled in (the scanner likewise spells the non-breaking space
        // as 0xCA).
        const unsigned char kFormatMarker = 0xC4;

        // The marker and its +/- flag are the third and fourth characters,
        // so a token shorter than four characters cannot carry them.
        // Checking the length first keeps the constructor from reading past
        // the end of a short token that sits at the end of the buffer.
        if ((end - start) >= 4 && start[2] == kFormatMarker) {
            if (start[3] == '+') {
                MinorSexChange(kSLex_CommentFormatOn);
            } else if (start[3] == '-') {
                MinorSexChange(kSLex_CommentFormatOff);
            }
        }
    }

    // Make the Formatting* do the work.
    virtual Boolean Display(Formatting *aFormat);

private:
    TextPtr fStart;     // first character of the comment text
    TextPtr fEnd;       // one past the last character of the comment text
};
// Display the comment through aFormat.
//   - The glue string "s#" is executed first in every case.
//   - kSLex_Comment tokens are handed to Formatting::Comment.
//   - kSLex_PoundLine tokens first execute the glue string "&n" so that the
//     preprocessor line starts on a fresh line, then are written verbatim.
//   - Any other type is written verbatim.
// The result is true when the token text is non-empty.
Boolean CommentToken::Display(Formatting *aFormat)
{
    aFormat->ExecuteGlue((FormatString)"s#");

    const int tokenType = Type();
    if (tokenType == kSLex_Comment) {
        aFormat->Comment((const char *)fStart, (const char *)fEnd);
    } else {
        if (tokenType == kSLex_PoundLine) {
            aFormat->ExecuteGlue((FormatString)"&n");
        }
        aFormat->Write((char *)fStart, fEnd - fStart);
    }
    return (fStart != fEnd);
}
/*ƒ-
** Predefined token types
*/
// Text shown for the error token.  The end pointer is derived from the
// array size (minus the terminating NUL) rather than a hand-counted length,
// so it stays correct whatever byte length the non-ASCII dashes have in the
// encoding this file is compiled in.
static unsigned char    gErrText[] = "——— Error ———";
static LexicalToken     gErr         (kSErr, gErrText, gErrText + sizeof(gErrText) - 1);

// New-line flavoured tokens; all three share the NewLineToken class and are
// told apart by their type.
static NewLineToken     gNewLine     (kSLex_NewLine);
static NewLineToken     gEOF         (kSLex_EOF);
static NewLineToken     gContinuation(kSLex_Null);

// Punctuation
static PredefinedToken  gClassColon  (kSLex_ClassColon, "::");
static PredefinedToken  gColon       (kSLex_Colon,      ":");
static PredefinedToken  gSemiColon   (kSLex_SemiColon,  ";");
static PredefinedToken  gLParen      (kSLex_LParen,     "(");
static PredefinedToken  gRParen      (kSLex_RParen,     ")");
static PredefinedToken  gLBrace      (kSLex_LBrace,     "[");
static PredefinedToken  gRBrace      (kSLex_RBrace,     "]");
static PredefinedToken  gLCurly      (kSLex_LCurly,     "{");
static PredefinedToken  gRCurly      (kSLex_RCurly,     "}");
static PredefinedToken  gComma       (kSLex_Comma,      ",");
static PredefinedToken  gEllipsis    (kSLex_Ellipsis,   "...");
/*
** Define the operators
*/
// Every operator has the major type kSLex_Op; the minor type says which
// operator it is.

// Member access, selection
static OperatorToken gClassStar   (kSLex_Op, "::*", kSLex_OpClassStar);
static OperatorToken gQuestion    (kSLex_Op, "?",   kSLex_OpQuestion);
static OperatorToken gPeriod      (kSLex_Op, ".",   kSLex_OpDot);
static OperatorToken gPeriodStar  (kSLex_Op, ".*",  kSLex_OpDotStar);

// Arithmetic
static OperatorToken gAdd         (kSLex_Op, "+",   kSLex_OpAdd);
static OperatorToken gSub         (kSLex_Op, "-",   kSLex_OpSub);
static OperatorToken gMul         (kSLex_Op, "*",   kSLex_OpMul);
static OperatorToken gDiv         (kSLex_Op, "/",   kSLex_OpDiv);
static OperatorToken gMod         (kSLex_Op, "%",   kSLex_OpMod);

// Logical and bitwise
static OperatorToken gXor         (kSLex_Op, "^",   kSLex_OpBXor);
static OperatorToken gLNot        (kSLex_Op, "!",   kSLex_OpLNot);
static OperatorToken gLAnd        (kSLex_Op, "&&",  kSLex_OpLAnd);
static OperatorToken gLOr         (kSLex_Op, "||",  kSLex_OpLOr);
static OperatorToken gBNot        (kSLex_Op, "~",   kSLex_OpBNot);
static OperatorToken gBAnd        (kSLex_Op, "&",   kSLex_OpBAnd);
static OperatorToken gBOr         (kSLex_Op, "|",   kSLex_OpBOr);
static OperatorToken gLSH         (kSLex_Op, "<<",  kSLex_OpLSh);
static OperatorToken gRSH         (kSLex_Op, ">>",  kSLex_OpRSh);

// Simple assignment and comparison
static OperatorToken gAssign      (kSLex_Op, "=",   kSLex_OpAssign);
static OperatorToken gLT          (kSLex_Op, "<",   kSLex_OpLT);
static OperatorToken gLE          (kSLex_Op, "<=",  kSLex_OpLE);
static OperatorToken gEQ          (kSLex_Op, "==",  kSLex_OpEQ);
static OperatorToken gNE          (kSLex_Op, "!=",  kSLex_OpNE);
static OperatorToken gGE          (kSLex_Op, ">=",  kSLex_OpGE);
static OperatorToken gGT          (kSLex_Op, ">",   kSLex_OpGT);

// Compound assignment
static OperatorToken gAddAssign   (kSLex_Op, "+=",  kSLex_OpAssignAdd);
static OperatorToken gSubAssign   (kSLex_Op, "-=",  kSLex_OpAssignSub);
static OperatorToken gMulAssign   (kSLex_Op, "*=",  kSLex_OpAssignMul);
static OperatorToken gDivAssign   (kSLex_Op, "/=",  kSLex_OpAssignDiv);
static OperatorToken gModAssign   (kSLex_Op, "%=",  kSLex_OpAssignMod);
static OperatorToken gXorAssign   (kSLex_Op, "^=",  kSLex_OpAssignBXor);
static OperatorToken gBAndAssign  (kSLex_Op, "&=",  kSLex_OpAssignBAnd);
static OperatorToken gBOrAssign   (kSLex_Op, "|=",  kSLex_OpAssignBOr);
static OperatorToken gLSHAssign   (kSLex_Op, "<<=", kSLex_OpAssignLSh);
static OperatorToken gRSHAssign   (kSLex_Op, ">>=", kSLex_OpAssignRSh);

// Increment / decrement and pointer member access
static OperatorToken gDecr        (kSLex_Op, "--",  kSLex_OpMinusMinus);
static OperatorToken gIncr        (kSLex_Op, "++",  kSLex_OpPlusPlus);
static OperatorToken gPointer     (kSLex_Op, "->",  kSLex_OpPointer);
static OperatorToken gPointerStar (kSLex_Op, "->*", kSLex_OpPointerStar);
/*ƒ-
** Reserved words
*/
// One object per reserved word: (spelling, major type[, minor type]).
// lookup() hands out pointers to these objects.
static ReservedWord gAuto      ("auto",      kSLex_Decl);
static ReservedWord gBreak     ("break",     kSLex_Break,  kSLex_BreakBreak);
static ReservedWord gCase      ("case",      kSLex_Case);
static ReservedWord gChar      ("char",      kSLex_Decl);
static ReservedWord gClass     ("class",     kSLex_Struct, kSLex_StructClass);
static ReservedWord gConst     ("const",     kSLex_Decl,   kSLex_DeclConst);
static ReservedWord gContinue  ("continue",  kSLex_Break,  kSLex_BreakContinue);
static ReservedWord gDefault   ("default",   kSLex_Default);
static ReservedWord gDo        ("do",        kSLex_Do);
static ReservedWord gDouble    ("double",    kSLex_Decl);
static ReservedWord gElse      ("else",      kSLex_Else);
static ReservedWord gEnum      ("enum",      kSLex_Struct, kSLex_StructEnum);
static ReservedWord gExtended  ("extended",  kSLex_Decl);
static ReservedWord gExtern    ("extern",    kSLex_Decl);
static ReservedWord gFor       ("for",       kSLex_For);
static ReservedWord gFloat     ("float",     kSLex_Decl);
static ReservedWord gFriend    ("friend",    kSLex_Decl);
static ReservedWord gGoto      ("goto",      kSLex_Break,  kSLex_BreakGoto);
static ReservedWord gIf        ("if",        kSLex_If);
static ReservedWord gInline    ("inline",    kSLex_Decl);
static ReservedWord gInt       ("int",       kSLex_Decl);
static ReservedWord gLong      ("long",      kSLex_Decl);
// NOTE(review): "operator" passes kSLex_DeclOperator as both the major and
// the minor type, whereas "const" and "volatile" use kSLex_Decl as the
// major type -- confirm this is intended.
static ReservedWord gOperator  ("operator",  kSLex_DeclOperator, kSLex_DeclOperator);
static ReservedWord gPascal    ("pascal",    kSLex_Decl);
static ReservedWord gPrivate   ("private",   kSLex_Public, kSLex_PublicPrivate);
static ReservedWord gProtected ("protected", kSLex_Public, kSLex_PublicProtected);
static ReservedWord gPublic    ("public",    kSLex_Public, kSLex_PublicPublic);
static ReservedWord gRegister  ("register",  kSLex_Decl);
static ReservedWord gReturn    ("return",    kSLex_Break,  kSLex_BreakReturn);
static ReservedWord gShort     ("short",     kSLex_Decl);
static ReservedWord gSigned    ("signed",    kSLex_Decl);
static ReservedWord gStatic    ("static",    kSLex_Decl);
static ReservedWord gStruct    ("struct",    kSLex_Struct, kSLex_StructStruct);
static ReservedWord gSwitch    ("switch",    kSLex_Switch);
static ReservedWord gTemplate  ("template",  kSLex_Struct, kSLex_StructTemplate);
static ReservedWord gTypedef   ("typedef",   kSLex_Decl);
static ReservedWord gUnion     ("union",     kSLex_Struct, kSLex_StructUnion);
static ReservedWord gUnsigned  ("unsigned",  kSLex_Decl);
static ReservedWord gVa_dcl    ("va_dcl",    kSLex_Decl);
static ReservedWord gVirtual   ("virtual",   kSLex_Decl);
static ReservedWord gVoid      ("void",      kSLex_Decl);
static ReservedWord gVolatile  ("volatile",  kSLex_Decl,   kSLex_DeclVolatile);
static ReservedWord gWhile     ("while",     kSLex_While);
//ƒ+
#pragma segment CScanner
// Classify the identifier text [start, end).
//
// If the text spells one of the reserved words, the result is a pointer to
// that word's predefined ReservedWord object.  Otherwise a new LexicalToken
// of type kSLex_Id covering [start, end) is allocated and returned.
static Syntactic *lookup(TextPtr start, TextPtr end)
{
    int len = end - start;

    // These are the bounds on the lengths of the words in the reservedWordList
    const int kMinWordLength = 2;
    const int kMaxWordLength = 9;

    //ƒ- Note that this list is in alphabetical order; the binary search
    //   below depends on it.
    static ReservedWord* reservedWordList[] = {
          &gAuto
        , &gBreak
        , &gCase
        , &gChar
        , &gClass
        , &gConst
        , &gContinue
        , &gDefault
        , &gDo
        , &gDouble
        , &gElse
        , &gEnum
        , &gExtended
        , &gExtern
        , &gFloat
        , &gFor
        , &gFriend
        , &gGoto
        , &gIf
        , &gInline
        , &gInt
        , &gLong
        , &gOperator
        , &gPascal
        , &gPrivate
        , &gProtected
        , &gPublic
        , &gRegister
        , &gReturn
        , &gShort
        , &gSigned
        , &gStatic
        , &gStruct
        , &gSwitch
        , &gTemplate
        , &gTypedef
        , &gUnion
        , &gUnsigned
        , &gVa_dcl
        , &gVirtual
        , &gVoid
        , &gVolatile
        , &gWhile
    };
    //ƒ+

    // Text outside the length bounds cannot be a reserved word.
    if (len <= kMaxWordLength && len >= kMinWordLength) {
        int min = 0;
        int max = sizeof(reservedWordList) / sizeof(reservedWordList[0]) - 1;

        while (min <= max) {
            int mid = (min + max) / 2;
            const char *word = reservedWordList[mid]->String();

            // cmp < 0: the text sorts before the word; cmp > 0: after it.
            int cmp = strncmp((const char *)start, word, len);
            if (cmp == 0) {
                // The first len characters agree, so the text either is the
                // word or is a proper prefix of a longer word.  A prefix
                // sorts BEFORE the longer word ("do" < "double"), so the
                // tie-break is the text length minus the word length.
                int wordLen = (int)strlen(word);
                if (wordLen == len)
                    return (reservedWordList[mid]);
                cmp = len - wordLen;
            }
            if (cmp < 0)
                max = mid - 1;
            else
                min = mid + 1;
        }
    }
    return (new LexicalToken(kSLex_Id, start, end));
}
/*
** w1
** Return a token corresponding to the single character passed it. The assign
** arg is true if this is the special case of op=
*/
#pragma segment CScanner
// Map a single operator character to its predefined token.  With assign
// true the character is taken to be followed by '=', giving the "op="
// token (or "!=", "==", "<=", ">=" for the comparison characters).
// Characters that have no "=" form yield the same token either way; '\0'
// yields the EOF token and anything unrecognised yields the error token.
static Syntactic *w1(int aChar, Boolean assign = false)
{
    //ƒ-
    if (assign) {
        // Characters with a distinct "followed by =" token.  Anything not
        // listed here falls through to the plain table below.
        switch (aChar) {
        case '+': return &gAddAssign;
        case '-': return &gSubAssign;
        case '*': return &gMulAssign;
        case '/': return &gDivAssign;
        case '%': return &gModAssign;
        case '^': return &gXorAssign;
        case '!': return &gNE;
        case '&': return &gBAndAssign;
        case '|': return &gBOrAssign;
        case '=': return &gEQ;
        case '<': return &gLE;
        case '>': return &gGE;
        }
    }

    switch (aChar) {
    case '+':  return &gAdd;
    case '-':  return &gSub;
    case '*':  return &gMul;
    case '/':  return &gDiv;
    case '%':  return &gMod;
    case '^':  return &gXor;
    case '!':  return &gLNot;
    case '&':  return &gBAnd;
    case '|':  return &gBOr;
    case '=':  return &gAssign;
    case '<':  return &gLT;
    case '>':  return &gGT;
    case '~':  return &gBNot;
    case ':':  return &gColon;
    case '.':  return &gPeriod;
    case '?':  return &gQuestion;
    case '\0': return &gEOF;
    }
    return &gErr;
    //ƒ+
}
// w2
#pragma segment CScanner
// Map a doubled character ("++", "--", "&&", "||", "<<", ">>", "==", "::")
// to its predefined token.  aChar is the character that was doubled; any
// other character yields the error token.
static Syntactic *w2(int aChar)
{
    //ƒ-
    if (aChar == '+') return &gIncr;
    if (aChar == '-') return &gDecr;
    if (aChar == '&') return &gLAnd;
    if (aChar == '|') return &gLOr;
    if (aChar == '<') return &gLSH;
    if (aChar == '>') return &gRSH;
    if (aChar == '=') return &gEQ;
    if (aChar == ':') return &gClassColon;
    return &gErr;
    //ƒ+
}
#pragma segment CScanner
// True when ch is one of the octal digits '0' through '7'.
inline Boolean isodigit(int ch)
{
    if (ch < '0')
        return false;
    return (ch <= '7');
}
// CScanner::ICScanner
#pragma segment CScanner
// Initialisation hook.  Nothing is set up here; the result is always noErr.
short CScanner::ICScanner()
{
    const short result = noErr;
    return result;
}
// CScanner::NextToken
#pragma segment CScanner
// Scan and return the next token from the text buffer.
//
// The result points either at one of the file-level predefined token
// objects (punctuation, operators, reserved words, new line, EOF, error) or
// at a token allocated with new (identifiers, values, comments and "#"
// lines) whose text is the buffer range [start, fBuffer).  Except on the two
// end-of-file returns at the top, fLastTokenStart is set to the first
// character of the token returned.
//
// NOTE(review): NextChar(), PushBack() and IsEOF() are declared outside this
// section.  The comments below assume NextChar() returns the character at
// fBuffer and advances it, PushBack() steps fBuffer back by one character,
// and IsEOF() reports that the scan has run off the end of the buffer --
// confirm against the class declaration.
Syntactic *CScanner::NextToken()
{
int ch;
int firstCh;
TextPtr start;
// End of file check
if (fBuffer >= fBufferEnd) {
return (&gEOF);
}
// Skip over the white space (0xCA is the non-breaking space)
// A '\n' is not skipped: it is returned as a token of its own.
while (fBuffer < fBufferEnd) {
ch = NextChar();
if (ch == '\n') {
fLastTokenStart = fBuffer - 1;
return (&gNewLine);
}
if (!isspace(ch))
if (ch != (unsigned char)0xCA)
break;
}
// End of file check
// (the buffer ended while still skipping white space)
if (fBuffer == fBufferEnd && isspace(ch)) {
return (&gEOF);
}
// Remember the start of the token
fLastTokenStart = start = fBuffer - 1;
// Identifier scanning
// After the first character, '%' and '$' are accepted as identifier
// characters in addition to letters, digits and '_'.
if (isalpha(ch) || (ch == '_')) {
while (isalnum(ch) || (ch == '_') || (ch == '%') || (ch == '$'))
ch = NextChar();
if (!IsEOF())
PushBack();
return (lookup(start, fBuffer));
}
// Number scanning
// NOTE(review): the exponent is only looked for after a '.', so "1e5" is
// not scanned as a single value here, and 'l'/'L' is the only suffix
// accepted.
if (isdigit(ch)) {
if (ch == '0') {
ch = NextChar();
// Hexadecimal constant: "0x" or "0X" followed by hex digits
if (ch == 'x' || ch == 'X') {
ch = NextChar();
while (isxdigit(ch))
ch = NextChar();
goto numDone;
}
}
while (isdigit(ch))
ch = NextChar();
if (ch == '.') {
ch = NextChar();
if (ch == '.') {
PushBack(); // '..' range symbol pushed back
PushBack();
goto numDone;
}
// Fraction digits
while (isdigit(ch))
ch = NextChar();
// Optional exponent: 'e' or 'E', an optional sign, then digits
if (ch == 'e' || ch == 'E') {
ch = NextChar();
if (ch == '+' || ch == '-')
ch = NextChar();
if (!isdigit(ch))
goto numDone;
while (isdigit(ch))
ch = NextChar();
}
}
numDone:;
// Allow the optional type marker.
if (ch == 'l' || ch == 'L')
ch = NextChar();
if (!IsEOF())
PushBack();
return (new LexicalToken(kSLex_Value, start, fBuffer));
}
// Everything else is punctuation, an operator, a literal, a comment or a
// preprocessor line, selected by the first character.
firstCh = ch;
switch (firstCh) {
//ƒ-
case ';': return (&gSemiColon);
case '(': return (&gLParen);
case ')': return (&gRParen);
case '{': return (&gLCurly);
case '}': return (&gRCurly);
case '[': return (&gLBrace);
case ']': return (&gRBrace);
case ',': return (&gComma);
case '?': return (&gQuestion);
//ƒ+
// String and character literals: scan to the matching quote (or EOF).
// The character following a backslash is skipped without being examined.
case '"':
case '\'':
for (;;) {
ch = NextChar();
if (ch == '\\')
NextChar();
else if (IsEOF() || (ch == firstCh))
break;
}
return (new LexicalToken(kSLex_Value, start, fBuffer));
//break;
// ":", "::" or "::*"
case ':':
ch = NextChar();
if (ch == ':') {
if (NextChar() == '*')
return (&gClassStar);
PushBack();
return (&gClassColon);
} else {
PushBack();
return (&gColon);
}
break;
// Operators that may only be followed by '=' ("*=", "%=", "^=", "!=")
case '*':
case '%':
case '^':
case '!':
ch = NextChar();
if (ch == '=')
return (w1(firstCh, true));
else {
PushBack();
return (w1(firstCh));
}
break;
// Operators that may be followed by '=' or doubled ("+=" / "++", ...)
case '+':
case '&':
case '|':
case '=':
ch = NextChar();
if (ch == '=')
return (w1(firstCh, true));
else if (ch == firstCh)
return (w2(firstCh));
else {
PushBack();
return (w1(firstCh));
}
break;
// "-", "-=", "--", "->" or "->*"
case '-':
ch = NextChar();
if (ch == '=')
return (&gSubAssign);
else if (ch == '-')
return (&gDecr);
else if (ch == '>') {
ch = NextChar();
if (ch == '*')
return (&gPointerStar);
else {
PushBack();
return (&gPointer);
}
} else {
PushBack();
return (&gSub);
}
break;
// "<", "<=", "<<", "<<=" and the corresponding '>' forms
case '<':
case '>':
ch = NextChar();
if (ch == '=')
return (w1(firstCh, true));
else if (ch == firstCh) {
ch = NextChar();
if (ch == '=')
return ((firstCh == '<') ? &gLSHAssign : &gRSHAssign);
else {
PushBack();
return (w2(firstCh));
}
} else {
PushBack();
return (w1(firstCh));
}
break;
// "/", "/=", or the start of a "//" or "/*" comment
case '/':
ch = NextChar();
switch (ch) {
case '/':
// Copy the comment up to but not including the '\n'
do
ch = NextChar();
while ((ch > 0) && (ch != '\n'));
if (ch == '\n')
PushBack();
return (new CommentToken(kSLex_Comment, start, fBuffer));
//break;
case '*':
// Scan to the closing "*/".  A run of '*' characters is consumed as a
// unit, so "**/" also closes the comment.  The loop also stops when
// NextChar() returns a value that is not greater than zero.
while (ch > 0) {
ch = NextChar();
if (ch == '*') {
while (ch == '*')
ch = NextChar();
if (ch == '/')
break;
}
}
return (new CommentToken(kSLex_Comment, start, fBuffer));
//break;
case '=':
return (&gDivAssign);
//break;
default:
PushBack();
return (&gDiv);
}
break;
// ".", ".*" or "..."
case '.':
ch = NextChar();
if (ch == '*')
return (&gPeriodStar);
else if (ch == '.') {
if (NextChar() == '.')
return (&gEllipsis);
PushBack();
}
// Either one character (not '*' or '.') or two characters (".x") were
// read past the first '.'; together with the PushBack above this one
// puts them all back.
PushBack();
return (&gPeriod);
// break;
// Preprocessor line: the whole line becomes one CommentToken of type
// kSLex_PoundLine whose minor type records "if"/"elif"/"else"/"endif"
// (0 for any other directive).
case '#':
{
int type = 0; // Assume other type
enum {
kStart // Look for "if", "elif", "else"
, kText // Normal text
, kComment // Within a "/*" comment
, kCommentToEOL // Within a "//" comment
, kString // Within a string
, kChar // Within a character constant
} state = kStart;
/*
** State transitions:
** kStart -> kText
** kText -> {kComment, kCommentToEOL, kString, kChar}
** kComment -> kText
** kCommentToEOL -> kText
** kString -> kText
** kChar -> kText
**
** Note that we don't allow comments immediately following the
** "#". If this is a problem, extend the state machine by adding
** kStartComment, which commutes with kStart. For now, we don't
** care.
** The only reason we have a state machine here at all is because
** there is some code extant which has a multi-line comment at the
** end of a #define without line continuation characters at the end.
*/
for (;;) {
// Get the next character. Strings, character constants and
// line continuations use "\" to strop the character following.
ch = NextChar();
if (ch == '\\') {
NextChar();
continue;
}
// Blanks don't matter, but newlines do.
if (isspace(ch) && ch != '\n')
continue;
// EOF is fatal in all states
if (IsEOF()) {
Syntactic * aToken = new CommentToken(kSLex_PoundLine, start, fBuffer);
aToken->MinorSexChange(type);
return (aToken);
}
switch (state) {
case kStart:
// Determine what type of "#" this is. Following this is text.
// fBuffer - 1 is the character just read, i.e. the first non-blank
// character after the '#'.  Only a prefix is compared, so any
// directive that begins with "if" is classified as kSLex_PoundIf.
state = kText;
if (strncmp((char *)fBuffer - 1, "if", 2) == 0)
type = kSLex_PoundIf;
else if (strncmp((char *)fBuffer - 1, "elif", 4) == 0)
type = kSLex_PoundElif;
else if (strncmp((char *)fBuffer - 1, "else", 4) == 0)
type = kSLex_PoundElse;
else if (strncmp((char *)fBuffer - 1, "endif", 5) == 0)
type = kSLex_PoundEndIf;
else
// Do this character as text.
goto doText;
break;
case kText:
// Switch to one of the subordinate states.
doText: switch (ch) {
case '"':
state = kString;
break;
case '\'':
state = kChar;
break;
case '/':
switch (NextChar()) {
case '*':
state = kComment;
break;
case '/':
state = kCommentToEOL;
break;
default:
PushBack();
break;
}
break;
case '\\':
// End of line continuation character? Ignore the character
// following as if it is a newline the continuation hides
// it and if it isn't a newline it would be incorporated
// as part of the token.
// NOTE(review): a backslash is already consumed at the top of the
// loop, so ch is never '\\' when control gets here.
NextChar();
break;
case '\n':
{
// An unescaped newline ends the preprocessor line.  The newline is
// pushed back so that it is not part of the token text.
PushBack();
Syntactic * aToken = new CommentToken(kSLex_PoundLine, start, fBuffer);
aToken->MinorSexChange(type);
return (aToken);
}
}
break;
case kComment:
// Check if this is the end of the comment
// (the else below pairs with the inner if: a '*' that is not followed
// by '/' has the following character pushed back)
if (ch == '*')
if (NextChar() == '/')
state = kText;
else
PushBack();
break;
case kCommentToEOL:
// Keep going to the end of the comment line. If there is
// a continuation character, check if the character following
// is a newline. If it is, the state becomes kText. If the
// character is a newline, then change state to kText and let
// doText do the work. Otherwise ignore the character
// NOTE(review): as above, ch is never '\\' here because the top of the
// loop has already consumed the backslash and the character after it.
if (ch == '\\') {
if (NextChar() == '\n')
state = kText;
} else if (ch == '\n') {
state = kText;
goto doText;
}
break;
case kString:
if (ch == '"')
state = kText;
break;
case kChar:
if (ch == '\'')
state = kText;
break;
}
}
}
break;
// A backslash directly followed by a newline is a line continuation;
// any other use of a backslash outside a literal is an error token.
case '\\':
ch = NextChar();
if (ch == '\n')
return (&gContinuation);
else {
PushBack();
return (&gErr);
}
break;
// Remaining single characters ('~', '\0', ...) and anything
// unrecognised are mapped by w1.
default:
return (w1(firstCh));
}
}
// CScanner::LineNumber
#pragma segment CScanner
// Line number (counting from 1) associated with the most recent token: one
// plus the number of '\n' characters found from fBufferStart up to and
// including the character at fLastTokenStart.
size_t CScanner::LineNumber() const
{
    size_t newlinesSeen = 0;

    for (const unsigned char *scan = fBufferStart; scan <= fLastTokenStart; ++scan) {
        if (*scan == '\n')
            ++newlinesSeen;
    }
    return (newlinesSeen + 1);
}